" Data Analysis on Strokes: Risks Factors, Trends and Prevention Strategies " (Part 1)
Task 1
TASK 1: IMPORT THE MODULES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objs as go
from scipy.stats import chi2_contingency
from scipy import stats
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn .ensemble import VotingClassifier , BaggingClassifier , StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLars
from sklearn.linear_model import RidgeCV
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import LabelEncoder
# Module-level LabelEncoder instance (not used in the visible part of this notebook)
le = preprocessing.LabelEncoder()
import warnings
# NOTE(review): with no category argument this suppresses every warning globally,
# including deprecation warnings raised by the plotting and data libraries
warnings.filterwarnings('ignore')
print('Modules are imported.')
Modules are imported.
Task 2
TASK 2: IMPORT THE STROKE PREDICTION DATASET
# Location of the stroke prediction CSV on the local machine
stroke_csv_path = r'C:\Users\user\Downloads\healthcare-dataset-stroke-data.csv'
# Load the CSV into the DataFrame used by every later cell
stroke_data = pd.read_csv(stroke_csv_path)
Task 3
TASK 3: EXPLORE THE DATA
# Preview the first five rows (the default for head()) to check column names and sample values
stroke_data.head()
| id | gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9046 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
| 1 | 51676 | Female | 61.0 | 0 | 0 | Yes | Self-employed | Rural | 202.21 | NaN | never smoked | 1 |
| 2 | 31112 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
| 3 | 60182 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
| 4 | 1665 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
# Report the DataFrame dimensions as a (rows, columns) tuple
stroke_data.shape
(5110, 12)
# Print per-column non-null counts and dtypes; a non-null count below the row count flags missing values
stroke_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5110 entries, 0 to 5109 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5110 non-null int64 1 gender 5110 non-null object 2 age 5110 non-null float64 3 hypertension 5110 non-null int64 4 heart_disease 5110 non-null int64 5 ever_married 5110 non-null object 6 work_type 5110 non-null object 7 Residence_type 5110 non-null object 8 avg_glucose_level 5110 non-null float64 9 bmi 4909 non-null float64 10 smoking_status 5110 non-null object 11 stroke 5110 non-null int64 dtypes: float64(3), int64(4), object(5) memory usage: 479.2+ KB
# Summarize the numeric columns: count, mean, std, min, quartiles and max
stroke_data.describe()
| id | age | hypertension | heart_disease | avg_glucose_level | bmi | stroke | |
|---|---|---|---|---|---|---|---|
| count | 5110.000000 | 5110.000000 | 5110.000000 | 5110.000000 | 5110.000000 | 4909.000000 | 5110.000000 |
| mean | 36517.829354 | 43.226614 | 0.097456 | 0.054012 | 106.147677 | 28.893237 | 0.048728 |
| std | 21161.721625 | 22.612647 | 0.296607 | 0.226063 | 45.283560 | 7.854067 | 0.215320 |
| min | 67.000000 | 0.080000 | 0.000000 | 0.000000 | 55.120000 | 10.300000 | 0.000000 |
| 25% | 17741.250000 | 25.000000 | 0.000000 | 0.000000 | 77.245000 | 23.500000 | 0.000000 |
| 50% | 36932.000000 | 45.000000 | 0.000000 | 0.000000 | 91.885000 | 28.100000 | 0.000000 |
| 75% | 54682.000000 | 61.000000 | 0.000000 | 0.000000 | 114.090000 | 33.100000 | 0.000000 |
| max | 72940.000000 | 82.000000 | 1.000000 | 1.000000 | 271.740000 | 97.600000 | 1.000000 |
The command stroke_data.describe() generates descriptive statistics for the DataFrame, providing an overview of the data's distribution. This includes metrics such as count, mean, standard deviation, minimum, 25th percentile (Q1), median (50th percentile), 75th percentile (Q3), and maximum for each numerical column. The results help to understand the central tendency, dispersion, and shape of the dataset's distribution, allowing for initial insights into the data's characteristics and potential anomalies. For example, in this dataset, the average age is approximately 43.2 years, and the average BMI is around 28.9.
Task 4
TASK 4: ANALYZE RISK FACTORS
Task 4.1: Visualize variables distributions
# Numeric columns: one 30-bin histogram per column, drawn on a single 15x10 figure
stroke_data.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

# From here on, plots that relabel columns work on a deep copy (stroke_data2),
# so the original numeric stroke_data stays available for the statistical tests
stroke_data2 = stroke_data.copy(deep=True)
Task 4.2: Exploratory Data Analysis (EDA)
Task 4.2.1: Exploration and understanding of the data before embarking on more in-depth analyses
# STROKE STATUS
# Relabel the binary 'stroke' flag for display (1 -> 'Yes', 0 -> 'No').
# Later cells that filter or group stroke_data2 by 'stroke' rely on these labels.
stroke_data2['stroke'] = stroke_data2['stroke'].replace({1: 'Yes', 0: 'No'})
# Tally each stroke status; value_counts orders the rows by descending count
stroke_counts = stroke_data2['stroke'].value_counts().reset_index()
stroke_counts.columns = ['Stroke', 'Count']
# Tie colors and legend text to the category value rather than to its position,
# so the wedges stay correctly labelled whatever order value_counts returns
stroke_color_by_status = {'No': '#252E6C', 'Yes': '#FF4500'}
stroke_legend_by_status = {'No': 'No Stroke', 'Yes': 'Stroke'}
custom_colors = [stroke_color_by_status.get(status, '#808080') for status in stroke_counts['Stroke']]
legend_labels = [stroke_legend_by_status.get(status, str(status)) for status in stroke_counts['Stroke']]
# Draw the pie chart; autopct prints each wedge's share with one decimal
fig, ax = plt.subplots()
pie = ax.pie(stroke_counts['Count'], labels=stroke_counts['Stroke'], colors=custom_colors,
             autopct='%1.1f%%', startangle=90, wedgeprops={'edgecolor': 'black'})
plt.title('Distribution of Stroke Status')
# pie[0] holds the wedge patches; place the legend to the right of the axes
plt.legend(pie[0], legend_labels, loc="center left", bbox_to_anchor=(1, 0.5))
plt.tight_layout()
plt.show()
The pie chart visualizes the distribution of values in the 'stroke' column. The wedges are labelled 'Yes' (stroke cases) and 'No' (non-cases), and the legend names the same two categories 'Stroke' and 'No Stroke'. The percentages show the proportion of each category: 4.9% stroke cases ('Yes') and 95.1% non-cases ('No').
# GENDER DISTRIBUTION
# Count how many records fall in each gender category
gender_counts = stroke_data['gender'].value_counts()
# One color per bar, applied in the order value_counts returns the categories
custom_colors = ['#FF69B4', '#252E6C', '#90EE90']
# Draw the bar chart
fig, ax = plt.subplots()
bars = ax.bar(gender_counts.index, gender_counts.values, color=custom_colors)
# Write the exact count above each bar
for bar in bars:
    height = bar.get_height()
    ax.annotate('{}'.format(height),
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # 3 points above the bar
                textcoords="offset points",
                ha='center', va='bottom')
plt.title('Gender Distribution')
plt.show()
The bar chart visualizes the distribution of values in the 'gender' column. Each bar represents a gender category ('Female', 'Male', 'Other') with its corresponding count. The annotation on top of each bar displays the exact count for that category. In this case, there are 2994 Females, 2115 Males, and 1 Other.
#STROKE CASES BY GENDER
# Cross-count gender x stroke (columns 0 and 1). fill_value=0 keeps a gender that has
# no stroke cases at a count of 0 instead of NaN, so the arithmetic below stays defined.
gender_stroke_counts = stroke_data.groupby(['gender', 'stroke']).size().unstack(fill_value=0)
# Stroke rate within each gender, in percent: stroke cases / all records of that gender
gender_stroke_percentage = gender_stroke_counts[1] / gender_stroke_counts.sum(axis=1) * 100
# A gender without any stroke case has no slice to draw; drop it explicitly
gender_stroke_percentage = gender_stroke_percentage[gender_stroke_percentage > 0]
# Pie chart of the per-gender rates. A pie rescales its values to 100%, so the slices
# compare the rates with each other; they are not shares of all stroke cases.
fig = px.pie(names=gender_stroke_percentage.index, values=gender_stroke_percentage.values,
             title="Percentage of Stroke Cases by Gender",
             color_discrete_sequence=['#252E6C', '#FF69B4'])
# Show label and percentage on each slice, pull the first slice out, outline slices in white
fig.update_traces(textinfo="percent+label", pull=[0.1, 0], marker=dict(line=dict(color="white", width=2)))
fig.show()
This code calculates the stroke rate within each gender (stroke cases divided by the number of people of that gender) and represents those rates in a pie chart. Because a pie chart rescales its values so that they sum to 100%, the 48% (female) and 52% (male) slices compare the two stroke rates with each other; they are not the share of all stroke cases that occur in each gender. The nearly equal slices indicate that the stroke rate is similar for females and males, with a slightly higher rate among males.
# STROKE DISTRIBUTION BY AGE
# Number of records for every (age, stroke status) pair, as a tidy three-column frame
age_stroke_counts = (
    stroke_data2
    .groupby(["age", "stroke"])
    .size()
    .reset_index(name="Count")
)
# Bubble chart: x = age, y and bubble size = record count, color = stroke status
fig_bubble_age_stroke = px.scatter(
    age_stroke_counts,
    x="age",
    y="Count",
    size="Count",
    color="stroke",
    title="Stroke Distribution by Age",
    labels={"age": "Age", "Count": "Count", "stroke": "Stroke"},
    color_discrete_sequence=['#252E6C', '#FF4500'],
)
# Set the axis titles explicitly, then render
fig_bubble_age_stroke.update_layout(xaxis_title="Age", yaxis_title="Count")
fig_bubble_age_stroke.show()
The scatter plot shows, for each age, the number of individuals with and without a stroke. There is a notable concentration of 'Yes' (stroke) cases among individuals aged between 60 and 80 years, suggesting that this age range may have a higher risk of experiencing a stroke.
This code groups the data by age and stroke status, then counts the occurrences of each combination. The resulting scatter plot visualizes the distribution of stroke cases across different age groups. Understanding how stroke incidence varies with age is crucial for identifying age-related risk factors and informing prevention strategies.
Task 4.2.2 Perform a Student's t-test for continuous variables
# Split the ages by stroke outcome (1 = stroke, 0 = no stroke)
ages_with_stroke = stroke_data.loc[stroke_data['stroke'] == 1, 'age']
ages_without_stroke = stroke_data.loc[stroke_data['stroke'] == 0, 'age']
# Independent two-sample t-test comparing the mean age of the two groups
t_stat_age, p_value_age = stats.ttest_ind(ages_with_stroke, ages_without_stroke)
# Report the statistic and p-value
print(f'T-test for age - T-statistic: {t_stat_age}, P-value: {p_value_age}')
T-test for age - T-statistic: 18.08083426887953, P-value: 7.0307775129939774e-71
This code conducts a Student's t-test to compare the ages of individuals who had a stroke (1) and those who did not (0). A very low p-value (7.03e-71) indicates a statistically significant difference between the two groups, suggesting that age is an important factor in stroke prediction in this project. A high t-statistic like 18.08 further supports this notion, indicating a significant difference in age averages between the two groups.
# Box plot of age for each stroke status (0 = no stroke, 1 = stroke)
plt.figure(figsize=(10, 6))
sns.boxplot(data=stroke_data, x='stroke', y='age', palette=['blue', 'red'])
# Title and axis labels
plt.title('Box Plot of Age by Stroke Status')
plt.xlabel('Stroke')
plt.ylabel('Age')
# Replace the 0/1 tick labels with readable names
plt.xticks([0, 1], ['No Stroke', 'Stroke'])
plt.show()
This boxplot visualizes the age distribution by stroke status. The box of each group covers the middle 50% of its ages (the interquartile range): for individuals without stroke (No Stroke) it spans roughly 25 to 60 years, whereas for those with stroke (Stroke) it spans roughly 60 to 80 years. The line in the middle of each box represents the median (the central value) of the dataset for each group.
# DISTRIBUTION OF HYPERTENSION
# Relabel the binary 'hypertension' flag on the plotting copy (1 -> 'Yes', 0 -> 'No')
stroke_data2['hypertension'] = stroke_data2['hypertension'].replace({1: 'Yes', 0: 'No'})
# Number of patients in each hypertension category
hypertension_count = stroke_data2['hypertension'].value_counts()
# Slice colors, applied in category order
custom_colors = ['#252E6C', '#90EE90']
# Donut chart (hole=0.3) of the category counts
fig = px.pie(
    names=hypertension_count.index,
    values=hypertension_count,
    title='Distribution of Patients with and without Hypertension',
    hole=0.3,
    color_discrete_sequence=custom_colors,
)
# Show percentage and label on each slice; pull the second slice out to highlight it
fig.update_traces(textinfo='percent+label', pull=[0, 0.1])
fig.show()
This diagram allows for visualizing the distribution of patients with or without hypertension using a pie chart. In the context of the project, it helps to understand the prevalence of hypertension among the dataset patients. Interpreting the chart, we observe that 9.75% of patients have hypertension ("Yes"), while 90.3% do not ("No"). This suggests that hypertension is relatively uncommon in the dataset compared to patients without hypertension.
# Stroke rate (%) in each hypertension group: the mean of the 0/1 stroke flag times 100
stroke_rate_with_hypertension = stroke_data.loc[stroke_data['hypertension'] == 1, 'stroke'].mean() * 100
stroke_rate_without_hypertension = stroke_data.loc[stroke_data['hypertension'] == 0, 'stroke'].mean() * 100
# Two-row frame holding the rates to plot
data = pd.DataFrame({
    'Hypertension': ['With Hypertension', 'Without Hypertension'],
    'Stroke Rate': [stroke_rate_with_hypertension, stroke_rate_without_hypertension],
})
# Bar chart with the rate also used as the bar text
fig = px.bar(
    data,
    x='Hypertension',
    y='Stroke Rate',
    text='Stroke Rate',
    title='Stroke Rate by Hypertension',
    labels={'Hypertension': 'Hypertension Status', 'Stroke Rate': 'Stroke Rate (%)'},
)
# Bar colors, plus two-decimal percentage text placed outside the bars
fig.update_traces(
    marker_color=["#123F6A", "#89AED2"],
    texttemplate='%{text:.2f}%',
    textposition='outside',
)
fig.show()
This diagram allows for comparing the stroke rate between patients with and without hypertension, a known risk factor for strokes. Interpreting the results, we observe a higher stroke rate among patients with hypertension (13.25%) compared to those without hypertension (3.97%). This suggests that hypertension is associated with an increased risk of stroke in this dataset, underscoring the importance of hypertension management in stroke prevention.
# Cross-tabulate hypertension (rows) against stroke status (columns)
contingency_table = pd.crosstab(index=stroke_data['hypertension'], columns=stroke_data['stroke'])
# Chi-square test of independence; unpacks statistic, p-value, degrees of freedom, expected counts
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Report the statistic and p-value
print('Chi-square test for hypertension - Chi2:', chi2, ', P-value:', p)
Chi-square test for hypertension - Chi2: 81.6053682482931 , P-value: 1.661621901511823e-19
This chi-square test on the contingency table for hypertension and stroke status evaluates whether there's a significant association between these two variables in the dataset. Interpreting the results, we observe a chi-square value of 81.61 and a very low p-value (1.66e-19), indicating a significant association between hypertension and stroke status. This suggests that hypertension is an important risk factor for strokes in this dataset, reinforcing the need to monitor and manage hypertension as a stroke prevention strategy.
# DISTRIBUTION OF HEART DISEASE BY STROKE
# Relabel the binary 'heart_disease' flag on the plotting copy (1 -> 'Yes', 0 -> 'No')
stroke_data2['heart_disease'] = stroke_data2['heart_disease'].replace({1: 'Yes', 0: 'No'})
# Heart-disease labels of the stroke and non-stroke patients.
# Relies on stroke_data2['stroke'] already holding 'Yes'/'No' labels (set in the stroke status cell).
heart_disease_yes = stroke_data2[stroke_data2["stroke"] == "Yes"]["heart_disease"]
heart_disease_no = stroke_data2[stroke_data2["stroke"] == "No"]["heart_disease"]
# Parallel lists: data, legend name and color of each histogram trace
hist_data_heart_disease = [heart_disease_yes, heart_disease_no]
group_labels_heart_disease = ["Stroke: Yes", "Stroke: No"]
colors_heart_disease = ['#FF4500', '#252E6C']
# Empty Plotly figure that receives one histogram per group
fig_heart_disease = go.Figure()
# Add one semi-transparent histogram trace per group
for group_values, group_label, group_color in zip(
        hist_data_heart_disease, group_labels_heart_disease, colors_heart_disease):
    fig_heart_disease.add_trace(go.Histogram(x=group_values, nbinsx=10, opacity=0.7,
                                             name=group_label, marker_color=group_color))
# Chart and axis titles
fig_heart_disease.update_layout(title="Distribution of Heart Disease by Stroke",
                                xaxis_title="Heart Disease", yaxis_title="Frequency")
fig_heart_disease.show()
This diagram is useful for analyzing the distribution of heart disease according to stroke status, enabling us to assess the association between heart disease and stroke. Interpreting the results, we observe that among patients who have had a stroke (stroke yes), 47 have heart disease (heart disease yes) and 202 do not (heart disease no). Among stroke no patients, 229 had heart disease and 4632 did not.
These results indicate that, although the majority of stroke-free patients do not have heart disease (4632), a significant proportion of stroke patients also have heart disease (47). This suggests that heart disease may be a risk factor for stroke, reinforcing the importance of monitoring and managing heart health as a stroke prevention strategy.
# Cross-tabulate heart disease (rows) against stroke status (columns)
contingency_table = pd.crosstab(index=stroke_data['heart_disease'], columns=stroke_data['stroke'])
# Chi-square test of independence; unpacks statistic, p-value, degrees of freedom, expected counts
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Report the statistic and p-value
print('Chi-square test for heart diseases - Chi2:', chi2, ', P-value:', p)
Chi-square test for heart diseases - Chi2: 90.25956125843324 , P-value: 2.0887845685229236e-21
This chi-square test evaluates the association between heart disease and stroke status. The high chi-square value (90.26) and extremely low p-value (2.09e-21) indicate a significant association between heart disease and stroke. This suggests that heart disease is an important risk factor for strokes, emphasizing the need for heart disease management in stroke prevention strategies.
# STROKE RATE BY MARITAL STATUS
# Within each marital-status group, the proportion of 'No' and 'Yes' stroke labels
# (normalize=True), spread into one column per label
attrition_by_ever_married = (
    stroke_data2
    .groupby("ever_married")["stroke"]
    .value_counts(normalize=True)
    .unstack()
    .reset_index()
)
# Colors referenced by the color map below
custom_colors = ['#252E6C', '#010408']
# Donut chart whose slice values are the per-group stroke proportions (the "Yes" column)
fig_pie_ever_married = px.pie(
    attrition_by_ever_married,
    names="ever_married",
    values="Yes",
    hole=0.4,
    title="Stroke Rate by Marital Status",
    labels={"ever_married": "Ever Married"},
    color_discrete_map={'Yes': custom_colors[0], 'No': custom_colors[1]},
)
# Percentage and label on each slice, both slices pulled out, legend hidden
fig_pie_ever_married.update_traces(textinfo="percent+label", pull=[0.1, 0.1], showlegend=False)
# Caption placed in the middle of the donut hole
fig_pie_ever_married.update_layout(
    annotations=[dict(text="Stroke Rate", x=0.5, y=0.5, font_size=20, showarrow=False)]
)
fig_pie_ever_married.show()
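This pie chart visualizes the stroke rate based on marital status, helping to analyze whether having been married is associated with the likelihood of having a stroke. The slice values are the stroke rates within each group, rescaled by the pie chart to sum to 100%: the 79.9% slice for those who have been married ("Yes") and the 20.1% slice for those who have never been married ("No") therefore compare the two rates with each other and are not the share of all stroke cases in each group. The stroke rate is thus about four times higher among people who have been married, which suggests a potential correlation between marital status and stroke risk that could be significant for developing targeted prevention strategies.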
# Cross-tabulate marital status (rows) against stroke status (columns)
contingency_table = pd.crosstab(index=stroke_data['ever_married'], columns=stroke_data['stroke'])
# Chi-square test of independence; unpacks statistic, p-value, degrees of freedom, expected counts
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Report the statistic and p-value
print('Chi-square test for marital status - Chi2:', chi2, ', P-value:', p)
Chi-square test for marital status - Chi2: 58.923890259034195 , P-value: 1.6389021142314745e-14
This chi-square test evaluates the association between marital status and stroke incidence. The results show a Chi2 value of 58.92 and a very low p-value (1.64e-14), indicating a significant relationship between being married and the likelihood of having a stroke. This finding suggests that marital status is an important factor to consider in stroke risk analysis and prevention strategies.
# STROKE RATE BY WORK TYPE
# Mean of the 0/1 stroke flag per work type, i.e. the fraction of stroke cases in each group
work_stroke_rates = (
    stroke_data
    .groupby("work_type")["stroke"]
    .mean()
    .reset_index()
)
# Single bar color
colors = ['#006400']
# Bar chart of the stroke rate per work type
fig = px.bar(
    work_stroke_rates,
    x="work_type",
    y="stroke",
    title="Stroke Rate by Work Type",
    labels={"work_type": "Work Type", "stroke": "Stroke Rate"},
    color_discrete_sequence=colors,
)
# Axis titles, then render
fig.update_layout(xaxis_title="Work Type", yaxis_title="Stroke Rate")
fig.show()
This bar chart identifies stroke risk by work type, showing higher rates for self-employed individuals (7.9%) compared to government and private sector employees (5%). The lowest rate is among children (0.3%). These insights highlight the need for targeted prevention strategies, such as stress management programs for the self-employed and workplace wellness initiatives for other at-risk groups.
# Cross-tabulate work type (rows) against stroke status (columns)
contingency_table = pd.crosstab(index=stroke_data['work_type'], columns=stroke_data['stroke'])
# Chi-square test of independence; unpacks statistic, p-value, degrees of freedom, expected counts
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Report the statistic and p-value
print('Chi-square test for work type - Chi2:', chi2, ', P-value:', p)
Chi-square test for work type - Chi2: 49.163511976675295 , P-value: 5.397707801896119e-10
This chi-square test shows a significant association between work type and stroke incidence (Chi2: 49.16, P-value: 5.40e-10). This indicates that certain occupations, like self-employed, have a higher stroke risk, highlighting the need for targeted prevention strategies and workplace health programs.
# STROKE STATUS BY RESIDENCE TYPE
# Count records per (residence type, stroke label) and spread the labels into columns.
# Selecting the 'No'/'Yes' columns by name (reindex) instead of renaming by position keeps
# the counts under the right header even if a label is missing or the order changes.
residence_stroke_counts = (
    stroke_data2
    .groupby(['Residence_type', 'stroke'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['No', 'Yes'], fill_value=0)
    .reset_index()
)
# Drop the leftover 'stroke' name on the column index so the frame has plain column headers
residence_stroke_counts.columns.name = None
# Bar colors for the 'No' and 'Yes' series
custom_colors = ['#252E6C', '#FF4500']
# Stacked bar chart: one bar per residence type, split into 'No' and 'Yes'
fig = px.bar(residence_stroke_counts, x='Residence_type', y=['No', 'Yes'],
             title='Distribution of Stroke Status by Residence Type',
             color_discrete_sequence=custom_colors,
             labels={'variable': 'Stroke', 'value': 'Count'})
# Label the two category positions on the x-axis
fig.update_xaxes(tickvals=[0, 1], ticktext=['Rural', 'Urban'])
# The bars are stacked, so the tallest bar is the largest No + Yes total;
# size the y-axis from that total (plus 10% headroom) rather than from the largest single segment
stacked_totals = residence_stroke_counts[['No', 'Yes']].sum(axis=1)
fig.update_layout(
    yaxis=dict(range=[0, stacked_totals.max() * 1.1])  # Start at 0 on the y-axis
)
fig.show()
This stacked bar chart shows stroke incidence by residence type, revealing slightly higher stroke cases in urban areas (135) compared to rural areas (114). Both areas have a similar number of non-stroke cases. These findings suggest the need for targeted prevention strategies addressing urban-specific risk factors and improving healthcare access in rural areas.
# Cross-tabulate residence type (rows) against stroke status (columns)
contingency_table = pd.crosstab(index=stroke_data['Residence_type'], columns=stroke_data['stroke'])
# Chi-square test of independence; unpacks statistic, p-value, degrees of freedom, expected counts
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Report the statistic and p-value
print('Chi-square test for Residence type - Chi2:', chi2, ', P-value:', p)
Chi-square test for Residence type - Chi2: 1.0816367471627524 , P-value: 0.29833169286876987
This chi-square test shows no significant association between residence type (rural or urban) and stroke incidence (Chi2: 1.08, P-value: 0.298). This suggests that where a person lives does not significantly impact stroke risk, so prevention strategies should focus on other risk factors and ensure equal resource distribution across both areas.
# AVERAGE GLUCOSE LEVEL
# Print the observed minimum and maximum average glucose level
print(stroke_data['avg_glucose_level'].min())
print(stroke_data['avg_glucose_level'].max())
# Group boundaries. With right=False every interval is left-closed, [low, high), so the
# last edge must lie above the largest value: an edge equal to the observed maximum
# (271.74) would leave that record without a group (NaN). Open-ended outer edges
# (0 and infinity) also keep the grouping valid if the data range changes.
glucose_bins = [0, 70, 99, 125, float('inf')]
# One label per interval, in ascending order
glucose_labels = ['Low', 'Normal', 'Pre-Diabetes', 'Diabetes']
# Assign every record to its glucose group on the plotting copy
stroke_data2['avg_glucose_level_group'] = pd.cut(stroke_data2['avg_glucose_level'], bins=glucose_bins,
                                                 labels=glucose_labels, right=False)
# Preview the first rows to check the assignment
print(stroke_data2[['avg_glucose_level', 'avg_glucose_level_group']].head())
55.12 271.74 avg_glucose_level avg_glucose_level_group 0 228.69 Diabetes 1 202.21 Diabetes 2 105.92 Pre-Diabetes 3 171.23 Diabetes 4 174.12 Diabetes
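This analysis categorizes glucose levels to prepare for studying their relationship with stroke risk. The preview shows only the first five rows of the dataset, all of which are stroke patients: four of them fall into the 'Diabetes' category and one into 'Pre-Diabetes', and glucose levels in the dataset reach as high as 271.74. Five rows cannot establish a link on their own, so the association between high glucose levels and stroke is examined in the next cells; it motivates the importance of managing blood sugar levels to prevent strokes.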
# STROKE BY AVERAGE GLUCOSE
# Bar colors, applied in hue order: 'Yes' first, then 'No'
custom_palette = ['#FF4500', '#252E6C']
plt.figure(figsize=(10, 6))
# Count of records per glucose group, split by stroke label.
# hue_order pins the series order so that the palette and the legend labels below always
# refer to the same series; it expects stroke_data2['stroke'] to hold 'Yes'/'No' labels.
ax = sns.countplot(data=stroke_data2, x='avg_glucose_level_group', hue='stroke',
                   hue_order=['Yes', 'No'], palette=custom_palette)
# Title and axis labels
plt.title("Stroke Incidence by Average Glucose Level Group")
plt.xlabel("Average Glucose Level Group")
plt.ylabel("Count")
# Write the count above each bar. The loop variable is named 'patch' so that it does not
# overwrite the p-value 'p' left by the chi-square cells.
for patch in ax.patches:
    height = patch.get_height()
    if np.isnan(height):
        # int(NaN) raises ValueError; a bar without a height has nothing to annotate
        continue
    ax.annotate(f"{int(height)}", (patch.get_x() + patch.get_width() / 2., height), ha='center', va='bottom')
# Legend with a title; the labels follow hue_order
plt.legend(title="Stroke", labels=["Yes", "No"])
# Rotate the x-axis labels for readability and add a grid
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
This countplot shows that the highest number of stroke cases occur in the 'Diabetes' group (99 strokes), indicating a strong link between high glucose levels and stroke risk. Significant stroke cases in the 'Normal' (84) and 'Pre-Diabetes' (38) groups also suggest elevated glucose levels increase stroke risk. These insights highlight the importance of managing blood sugar levels for stroke prevention.
# Student's t-test comparing the average glucose level of stroke and non-stroke patients
# Split the glucose values by stroke outcome (1 = stroke, 0 = no stroke)
glucose_with_stroke = stroke_data.loc[stroke_data['stroke'] == 1, 'avg_glucose_level']
glucose_without_stroke = stroke_data.loc[stroke_data['stroke'] == 0, 'avg_glucose_level']
# Independent two-sample t-test on the two groups
t_stat_glucose, p_value_glucose = stats.ttest_ind(glucose_with_stroke, glucose_without_stroke)
# Report the statistic and p-value
print(f'Test t for average glucose level - t-statistic: {t_stat_glucose}, P-value: {p_value_glucose}')
Test t for average glucose level - t-statistic: 9.513352175431471, P-value: 2.7678105194741054e-21
This t-test shows a highly significant difference in average glucose levels between individuals who had a stroke and those who did not (t-statistic: 9.51, p-value: 2.77e-21). This indicates that elevated glucose levels are strongly associated with stroke risk, highlighting the importance of glucose management in stroke prevention strategies.
# Box plot of the average glucose level for each stroke status (0 = no stroke, 1 = stroke)
plt.figure(figsize=(10, 6))
sns.boxplot(data=stroke_data, x='stroke', y='avg_glucose_level', palette=['blue', 'red'])
# Title and axis labels
plt.title('Box Plot of Average Glucose Level by Stroke Status')
plt.xlabel('Stroke')
plt.ylabel('Average Glucose Level')
# Replace the 0/1 tick labels with readable names
plt.xticks([0, 1], ['No Stroke', 'Stroke'])
plt.show()
This box plot shows that stroke patients have higher average glucose levels (75-200) compared to those without a stroke (75-120). This indicates a significant difference, suggesting that elevated glucose levels are a risk factor for strokes. Effective glucose management is crucial for stroke prevention.
# BMI GROUP
# Interval edges for the BMI classes. With right=False every interval is left-closed,
# [low, high), so the edges are the first value of the next class (25, 30, 35, 40):
# edges of 24.9, 29.9, ... would push a BMI of exactly 24.9 into 'Overweight',
# 29.9 into 'Obese Class I', and so on.
bmi_bins = [10, 18.5, 25, 30, 35, 40, 50, float('inf')]
# One label per interval, in ascending order
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese Class I', 'Obese Class II', 'Obese Class III', 'Extreme Obesity']
# Assign every record to its BMI class on the plotting copy; missing BMI values stay NaN
stroke_data2['bmi_group'] = pd.cut(stroke_data2['bmi'], bins=bmi_bins, labels=bmi_labels, right=False)
# Print the observed minimum and maximum BMI
print(stroke_data2['bmi'].min())
print(stroke_data2['bmi'].max())
# Preview the first five rows to check the assignment
print(stroke_data2[['bmi', 'bmi_group']].head())
10.3
97.6
bmi bmi_group
0 36.6 Obese Class II
1 NaN NaN
2 32.5 Obese Class I
3 34.4 Obese Class I
4 24.0 Normal
This analysis categorizes BMI into groups to understand its relationship with stroke risk. Results show several individuals in higher BMI categories (e.g., Obese Class I and II), suggesting a link between high BMI and stroke risk. This highlights the need for targeted interventions to manage BMI as part of stroke prevention strategies.
# STROKE BY BMI
# Number of records for every (BMI group, stroke label) pair, as a tidy three-column frame
bmi_stroke_counts = (
    stroke_data2
    .groupby(['bmi_group', 'stroke'])
    .size()
    .reset_index(name='Count')
)
# Bar chart of the counts per BMI group, colored by stroke label
fig = px.bar(
    bmi_stroke_counts,
    x='bmi_group',
    y='Count',
    color='stroke',
    title='Distribution of Strokes by BMI Group',
    labels={'bmi_group': 'BMI Group', 'Count': 'Count', 'stroke': 'Stroke'},
    color_discrete_sequence=['#252E6C', '#FF4500'],
)
# Axis and legend titles, then render
fig.update_layout(xaxis_title='BMI Group', yaxis_title='Count', legend_title='Stroke')
fig.show()
This bar chart shows that higher BMI groups (Overweight, Obese Class I-III) have more stroke cases, indicating increased stroke risk. Normal and Overweight categories have the highest stroke counts. These results highlight the need for targeted stroke prevention efforts focusing on managing BMI through healthy lifestyle changes.
# Perform a t-test to compare BMI levels between groups with and without a stroke.
# The 'bmi' column contains missing values, and ttest_ind propagates NaN by
# default, which turns both the statistic and the p-value into nan. Drop the
# missing values from each group before running the test.
bmi_with_stroke = stroke_data.loc[stroke_data['stroke'] == 1, 'bmi'].dropna()  # had a stroke
bmi_without_stroke = stroke_data.loc[stroke_data['stroke'] == 0, 'bmi'].dropna()  # no stroke
t_stat_bmi, p_value_bmi = stats.ttest_ind(bmi_with_stroke, bmi_without_stroke)
# Display the results of the t-test
print(f'Test t for bmi - t-statistic: {t_stat_bmi}, P-value: {p_value_bmi}')
Test t for bmi - t-statistic: nan, P-value: nan
# Box plot of BMI split by stroke status
# Open a 10x6 inch figure for the plot
plt.figure(figsize=(10, 6))
# Draw one box per stroke value (blue first, red second) and keep the axes handle
bmi_box_axes = sns.boxplot(data=stroke_data, x='stroke', y='bmi', palette=['blue', 'red'])
# Axis labels and title
bmi_box_axes.set_xlabel('Stroke')
bmi_box_axes.set_ylabel('BMI')
bmi_box_axes.set_title('Box Plot of BMI by Stroke Status')
# Replace the tick values 0 / 1 with readable names
bmi_box_axes.set_xticks([0, 1])
bmi_box_axes.set_xticklabels(['No Stroke', 'Stroke'])
# Render the plot
plt.show()
This box plot suggests that stroke patients tend to have somewhat higher BMIs (roughly 30-35) than those without strokes (roughly 25-35). This points to higher BMI as a possible risk factor for strokes, although the plot alone does not establish statistical significance, and it highlights the importance of maintaining a healthy BMI for stroke prevention.
# SMOKING STATUS DISTRIBUTION
# Frequency of each smoking status as a two-column DataFrame: 'smoking_status', 'count'
smoking_status_counts = (
    stroke_data['smoking_status']
    .value_counts()
    .rename_axis('smoking_status')
    .reset_index(name='count')
)
# One colour per smoking status
custom_colors = ['#FF6347', '#4682B4', '#32CD32', '#FFD700']
# Bar chart: one bar per smoking status, coloured by status
fig = px.bar(
    data_frame=smoking_status_counts,
    x='smoking_status',
    y='count',
    color='smoking_status',
    color_discrete_sequence=custom_colors,
    title='Smoking Status Distribution',
)
# Render the chart
fig.show()
This bar chart shows the distribution of smoking statuses, with 1892 never smoked, 1544 unknown, 885 formerly smoked, and 789 currently smoke. The high number of current and former smokers (1674) highlights a significant at-risk population, emphasizing the need for targeted anti-smoking campaigns and education to reduce stroke risk. Addressing the 'Unknown' category is also important for data accuracy.
# STROKE DISTRIBUTION BY SMOKING STATUS
# Count rows per (smoking_status, stroke) pair, pivot the stroke values into
# columns, fill empty combinations with 0 and turn smoking_status back into a column
smoking_stroke_counts = (
    stroke_data2
    .groupby(['smoking_status', 'stroke'])
    .size()
    .unstack()
    .fillna(0)
    .reset_index()
)
# One colour per smoking status
custom_colors = ['#FF6347', '#32CD32', '#4682B4', '#FFD700']
# Pie chart sized by the 'Yes' column, i.e. the stroke cases in each smoking status
fig = px.pie(
    data_frame=smoking_stroke_counts,
    names='smoking_status',
    values='Yes',
    color_discrete_sequence=custom_colors,
    title='Stroke Distribution by Smoking Status',
)
# Turn the pie into a donut and label each sector with its percentage and name
fig.update_traces(textinfo='percent+label', hole=0.4)
# Render the chart
fig.show()
This pie chart shows the distribution of stroke cases by smoking status: 36.1% never smoked, 18.9% unknown, 28.1% formerly smoked, and 16.9% currently smoke. The significant proportion of stroke cases among former smokers highlights the long-term risks of smoking, emphasizing the need for targeted cessation programs and monitoring for former smokers.
# Cross-tabulate smoking status (rows) against stroke status (columns)
contingency_table = pd.crosstab(
    index=stroke_data['smoking_status'],
    columns=stroke_data['stroke'],
)
# Chi-square test of independence on the cross-tabulation; the result unpacks into
# the statistic, the p-value, the degrees of freedom and the expected frequencies
chi2, p, dof, expected = chi2_contingency(observed=contingency_table)
# Report the statistic and the p-value
print('Chi-square test for smoking status - Chi2:', chi2, ', P-value:', p)
Chi-square test for smoking status - Chi2: 29.147269191399264 , P-value: 2.0853997025008455e-06
This chi-square test reveals a significant association between smoking status and stroke incidence (Chi2: 29.15, P-value: 2.09e-06), indicating that smoking habits greatly impact stroke risk. This underscores the need for targeted interventions, including robust smoking cessation programs and ongoing support for current and former smokers, to effectively reduce stroke incidence.
# AGE GROUP
# Bin edges for the age groups. With right-closed bins the intervals are
# [0, 20], (20, 40], (40, 60], (60, 80] and (80, inf), which is exactly what the
# labels below describe.
age_bins = [0, 20, 40, 60, 80, float('inf')]
# Labels for each age group, in the same order as the intervals above
age_labels = ['0-20', '21-40', '41-60', '61-80', '81+']
# Categorize each age into its group. Left-closed bins (right=False) would push an
# age that sits on a boundary into the next group up (for example 80 into '81+'),
# contradicting the labels, so the bins are right-closed. include_lowest=True keeps
# an age of exactly 0 inside '0-20'.
stroke_data2['age_group'] = pd.cut(
    stroke_data2['age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)
# Display the first few rows of the DataFrame with the columns 'age' and 'age_group'
print(stroke_data2[['age', 'age_group']].head())
age age_group
0 67.0 61-80
1 61.0 61-80
2 80.0 81+
3 49.0 41-60
4 79.0 61-80
Categorizing ages helps identify high-risk age groups for strokes. The data shows a significant number of individuals aged 61-80 and 81+, indicating these groups are at higher risk. This highlights the need for targeted prevention and health monitoring for older adults, as well as early detection and lifestyle modifications for middle-aged adults to prevent stroke.
# PREVALENCE OF HEART DISEASE ACROSS AGE GROUP
# Open a 12x6 inch figure for the plot
plt.figure(figsize=(12, 6))
# Two-colour palette, one colour per heart_disease value
custom_palette = ['#FF4500', '#252E6C']
# Count rows per age group, split by heart_disease, and keep the axes handle
heart_disease_axes = sns.countplot(
    data=stroke_data2,
    x="age_group",
    hue="heart_disease",
    palette=custom_palette,
)
# Title and axis labels
heart_disease_axes.set_title("Prevalence of Heart Disease Across Age Groups")
heart_disease_axes.set_xlabel("Age Group")
heart_disease_axes.set_ylabel("Count")
# Show grid lines
heart_disease_axes.grid(True)
# Relabel the legend entries.
# NOTE(review): these labels are applied by position, so "Yes" lands on whichever
# hue level seaborn draws first - confirm that this matches the heart_disease values.
heart_disease_axes.legend(title="Heart Disease", labels=["Yes", "No"])
# Rotate the x-axis tick labels for readability
plt.xticks(rotation=45)
# Render the plot
plt.show()
This countplot shows that heart disease prevalence increases with age, particularly affecting those aged 41 and above. Younger groups (0-40) have almost no heart disease, while older groups (41-80+) show significant cases. These insights highlight the need for targeted cardiovascular health monitoring and interventions for older adults to reduce stroke risk.
# AGE BY HYPERTENSION
# Recode hypertension from 1 / 0 to 'Yes' / 'No' (this overwrites the column in stroke_data)
stroke_data['hypertension'] = stroke_data['hypertension'].replace(to_replace={1: 'Yes', 0: 'No'})
# Overlaid age histograms, one per hypertension status
fig = px.histogram(
    data_frame=stroke_data,
    x='age',
    color='hypertension',
    color_discrete_map={'Yes': '#252E6C', 'No': '#BC3030'},
    nbins=30,
    barmode='overlay',
    labels={'hypertension': 'Hypertension', 'age': 'Age'},
    title='Distribution of Age by Hypertension Status',
)
# Render the plot
fig.show()
This histogram shows that hypertension prevalence increases significantly with age, particularly from age 40 onwards. Since hypertension is a major risk factor for stroke, this highlights the need for targeted blood pressure monitoring and management in middle-aged and older adults to reduce stroke risk. Public health campaigns and preventive measures should focus on these age groups to effectively manage hypertension and prevent strokes.
# HYPERTENSION BY SMOKING STATUS
# Keep only the rows whose hypertension value is "Yes"
yes_hypertension_stroke_data = stroke_data2.loc[stroke_data2['hypertension'] == 'Yes']
# Frequency of each smoking status among those rows, as a two-column
# DataFrame: 'Smoking_Status', 'Count'
hypertension_by_smoking = (
    yes_hypertension_stroke_data['smoking_status']
    .value_counts()
    .rename_axis('Smoking_Status')
    .reset_index(name='Count')
)
# One colour per smoking status
custom_colors = ['#FF6347', '#32CD32', '#FFD700', '#4682B4']
# Pie chart: one slice per smoking status, sized by its count
fig = px.pie(
    data_frame=hypertension_by_smoking,
    names='Smoking_Status',
    values='Count',
    color_discrete_sequence=custom_colors,
    labels={'Smoking_Status': 'Smoking Status', 'Count': 'Count'},
    title="Hypertension Cases by Smoking Status",
)
# Render the plot
fig.show()
This pie chart shows that 46.6% of hypertension cases are among people who never smoked, while 24.1% are former smokers and 18.9% are current smokers; the remainder have an unknown smoking status. Because nearly half of the cases occur in people who never smoked, factors other than smoking are clearly important, and since the chart shows counts rather than rates it cannot by itself quantify how much smoking contributes to hypertension. Comprehensive stroke prevention strategies should address both smoking-related and other risk factors, and enhance data collection for accuracy.
# Tally how many rows fall under each smoking status
smoking_status_column = stroke_data2['smoking_status']
smoking_status_column.value_counts()
smoking_status
never smoked 1892
Unknown 1544
formerly smoked 885
smokes 789
Name: count, dtype: int64
Counting smoking statuses reveals that 1892 never smoked, 1544 are unknown, 885 formerly smoked, and 789 currently smoke. This helps identify high-risk groups for targeted stroke prevention. Strengthening smoking cessation programs and improving data accuracy, especially for the 'Unknown' category, are crucial for effective stroke prevention strategies.
# BMI AND AVERAGE GLUCOSE BY STROKE
# Define a custom color map for the values in the 'stroke' column
color_discrete_map_inverted = {'Yes': '#FF4500', 'No': '#252E6C'}
# 'stroke' is stored as 1 / 0 in stroke_data. Plotly Express treats a numeric colour
# column as continuous and ignores color_discrete_map, so the custom colours were
# never applied. Plot a relabelled copy instead; stroke_data itself is not modified,
# and values that are already 'Yes' / 'No' pass through replace() unchanged.
stroke_scatter_data = stroke_data.assign(
    stroke=stroke_data['stroke'].replace({1: 'Yes', 0: 'No'})
)
# Create a scatter plot with Plotly Express
fig = px.scatter(
    stroke_scatter_data,  # Copy of stroke_data with 'stroke' relabelled as 'Yes' / 'No'
    x='bmi',  # The x-axis values are defined by the 'bmi' column
    y='avg_glucose_level',  # The y-axis values are defined by the 'avg_glucose_level' column
    color='stroke',  # Points are colored based on the values in the 'stroke' column
    title='BMI and Average Glucose Level vs. Stroke',  # Title of the plot
    labels={'bmi': 'BMI', 'avg_glucose_level': 'Average Glucose Level', 'stroke': 'Stroke'},  # Rename axis labels
    color_discrete_map=color_discrete_map_inverted,  # Use the custom color map
    width=1000,  # Set the width of the plot
    height=600,  # Set the height of the plot
)
# Update the layout of the plot to specify the legend font size
fig.update_layout(
    legend=dict(
        font=dict(
            size=12,  # Legend font size
        )
    )
)
# Show the plot
fig.show()
This scatter plot shows that stroke cases (the points identified as stroke by the colour legend) occur across a wide range of BMI values and are associated with higher glucose levels. Stroke cases are more prevalent at glucose levels above 150. This highlights the need for integrated management of BMI and glucose levels through healthy lifestyle choices and regular monitoring to reduce stroke risk.